import requests
from bs4 import BeautifulSoup
from dbUtils import DBTool  # 导入第一部分的数据库工具类

# 爬取函数
# Scrape helper: download a page and pull out matched elements.
def scrawler(url, selector, flag):
    """Fetch *url* and extract the elements matched by a CSS *selector*.

    Args:
        url: Page URL to download.
        selector: CSS selector passed to ``BeautifulSoup.select``.
        flag: 0 -> return the stripped text of each match;
              1 -> return the ``href`` attribute of each match.

    Returns:
        A list of strings (texts or hrefs), one per matched element.

    Raises:
        requests.HTTPError: if the server responds with a non-2xx status.
        ValueError: if *flag* is neither 0 nor 1.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
    }
    # timeout keeps the script from hanging forever on a stalled connection
    r = requests.get(url, headers=headers, timeout=30)
    # fail loudly instead of silently parsing an error page
    r.raise_for_status()
    bs = BeautifulSoup(r.text, 'html.parser')
    matches = bs.select(selector)
    if flag == 0:
        return [item.text.strip() for item in matches]
    if flag == 1:
        return [item.attrs['href'] for item in matches]
    # Original code fell off the end and returned None here; make it explicit.
    raise ValueError(f"flag must be 0 or 1, got {flag!r}")

if __name__ == '__main__':
    # Scrape one arXiv abstract page and store the record in the database.
    url = "https://arxiv.org/abs/2409.00580"
    db = DBTool()

    # Fetch and parse the page ONCE. The original issued five identical
    # HTTP requests (one per field) by calling scrawler() repeatedly.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()  # abort early on a bad response
    soup = BeautifulSoup(response.text, 'html.parser')

    def _texts(selector):
        """Stripped text of every element matched by *selector*."""
        return [item.text.strip() for item in soup.select(selector)]

    # Extract the fields from the single parsed page.
    title = _texts("#abs > h1")[0]
    # Join the author list into one comma-separated string.
    author = ', '.join(_texts("#abs > div.authors > a:nth-child(2)"))
    abstract = _texts("#abs > blockquote")[0]
    pdfurl = soup.select("#abs > a:nth-child(5)")[0].attrs['href']
    pub_date = _texts("#abs-outer > div.leftcolumn > div.submission-history")[0]

    # Insert into the database and echo the stored rows on success.
    if db.insert(title, author, abstract, pdfurl, pub_date):
        print("数据插入成功")
        print(db.queryAll())